######################################################################
### Fig 6: Words with Top 30 Loadings
######################################################################
rm(list=ls())

## library(RWeka) # for bigram tokenizer
library(tm)
library(SnowballC)
library(glmnet)
library(wordcloud)
library(maptpx)

## output location: figures are written to the current working directory
FIG_DIR <- getwd()

## read the tab-separated trade-bill table; quoting is disabled so that
## quote characters inside the text fields are kept literally
bills <- read.csv("./tradebills.txt",
                  sep = "\t", header = TRUE, quote = "")

## drop every bill whose identifier starts with 106-109, i.e. everything
## before the 110th; a logical mask needs no special case for "no matches"
old_prefixes <- c("106", "107", "108", "109")
is_old <- substring(bills$bill, 1, 3) %in% old_prefixes
bills <- bills[!is_old, ]

## lobbied bills first, then show the first three columns of both ends of
## the table as a quick sanity check
bills <- bills[order(bills$lobbied, decreasing = TRUE), ]
head(bills)[, 1:3]
tail(bills)[, 1:3]

## -------------------------------------------------------------------
## clean-up text  ----------------------------------------------------
## -------------------------------------------------------------------

## Build a tm corpus with one document per bill summary.
Corpus <- Corpus(VectorSource(as.character(bills$summary)))

## Clean the corpus: collapse white space, remove numbers and punctuation,
## lower-case, and drop English stop words.  Each step must take the result
## of the previous step (Corpus.clean) as input; feeding the raw corpus in
## again would silently discard the transformations applied so far.
Corpus.clean <- tm_map(Corpus, content_transformer(stripWhitespace))
Corpus.clean <- tm_map(Corpus.clean, content_transformer(removeNumbers))
Corpus.clean <- tm_map(Corpus.clean, content_transformer(removePunctuation))
Corpus.clean <- tm_map(Corpus.clean, content_transformer(tolower))
Corpus.clean <- tm_map(Corpus.clean, removeWords, stopwords("english"))

## Replace any remaining digit with a blank, then squeeze the runs of blanks
## left behind by the removals above into a single space.
newtext <- lapply(Corpus.clean, function(x) gsub("[0-9]{1}", " ", x))
newtext <- lapply(newtext, function(x) gsub(" +", " ", x))

## Re-wrap the cleaned strings as the corpus used to build the
## document-term matrix.
corpus <- Corpus(VectorSource(unlist(newtext)))

## set the mc.cores option to 1
## NOTE(review): presumably needed because the RWeka (Java) tokenizer does
## not cooperate with forked workers -- confirm
options(mc.cores = 1)

## Tokenizer handed to DocumentTermMatrix() through the control list.
## With min = max = 1 it emits 1-grams (single words) only, despite the
## "Bigram" in its name.
BigramTokenizer <- function(x) {
    unigram_ctrl <- RWeka::Weka_control(min = 1, max = 1)
    RWeka::NGramTokenizer(x, unigram_ctrl)
}

## Settings for the document-term matrix: custom tokenizer, keep only terms
## that occur in at least 10 documents, strip punctuation, drop English
## stop words, stem, lower-case, and require at least 3 characters per term.
## NOTE(review): tm applies some of these options in the order they appear
## in the list, so keep the order when editing -- confirm against ?termFreq
dtm.control <- list(
    tokenize          = BigramTokenizer,
    bounds            = list(global = c(10, Inf)),
    removePunctuation = TRUE,
    stopwords         = stopwords("english"),
    stemming          = TRUE,
    tolower           = TRUE,
    wordLengths       = c(3, Inf)
)

## documents in rows, terms in columns
bills_dtm <- DocumentTermMatrix(corpus, control = dtm.control)
dim(bills_dtm)

## number of retained tokens per document; documents left with none are
## dropped from the working copy `dtm`
rowTotals <- apply(bills_dtm, MARGIN = 1, FUN = sum)
dtm <- bills_dtm[rowTotals > 0, ]
dim(dtm)

## outcome vector: 1 if the bill was lobbied, 0 otherwise, restricted to the
## documents that still have at least one term so it lines up with dtm
has_terms <- rowTotals > 0
lobbied <- ifelse(bills$lobbied == 1, yes = 1, no = 0)[has_terms]

## share of lobbied bills, followed by a check that the dimensions agree
sum(lobbied) / length(lobbied)
dim(dtm)
length(lobbied)


## Optional vocabulary pruning by tf-idf; switched off by the flag below.
tfidf_remove <- FALSE
if (tfidf_remove) {
    ## mean within-document relative frequency of each term ...
    tf_by_entry <- dtm$v / row_sums(dtm)[dtm$i]
    mean_tf <- tapply(tf_by_entry, dtm$j, mean)
    ## ... weighted by the log inverse document frequency
    idf <- log2(nDocs(dtm) / col_sums(dtm > 0))
    term_tfidf <- mean_tf * idf

    ## (evaluated but not auto-printed, since it sits inside the braces)
    summary(term_tfidf)

    ## keep the terms at or above the 3rd quartile, which is the fifth
    ## element of summary()
    cutoff <- as.numeric(summary(term_tfidf)[5])
    keep_terms <- which(term_tfidf >= cutoff)
    dtm <- dtm[, keep_terms]
    summary(col_sums(dtm))
    dim(dtm)
}


## convert to an ordinary dense matrix for the model fit below
dtm <- as.matrix(dtm)


## -------------------------------------------------------------------
## LASSO on text     -------------------------------------------------
## -------------------------------------------------------------------


## 10-fold cross-validated binomial glmnet fit of the lobbying indicator on
## the term counts.  The seed is fixed before the fit so the run is
## repeatable.
set.seed(1234)
parallel <- TRUE
if (parallel) {
    library(doParallel)

    ## ask for at most 10 workers, but never more than the machine offers;
    ## detectCores() may return NA, in which case fall back to one worker
    n_avail <- parallel::detectCores()
    if (is.na(n_avail)) {
        n_avail <- 1L
    }
    registerDoParallel(cores = min(10L, n_avail))
}

## single call for both modes: `parallel = FALSE` is the sequential fit.
## The implicit worker cluster (if one was created) is shut down again even
## when the fit fails, so no worker processes are left behind.
glmnet.i <- tryCatch(
    cv.glmnet(dtm, lobbied,
              family = "binomial", nfolds = 10,
              parallel = parallel),
    finally = if (parallel) invisible(stopImplicitCluster())
)

## Coefficients of the cross-validated fit.  coef() is called without `s`,
## so glmnet's default choice of penalty applies.  The first row is the
## intercept and is dropped, leaving one named loading per term.
beta <- as.matrix(coef(glmnet.i))
beta <- beta[-1,] # removing intercept

beta_ordered <- beta[order(beta)]

## Up to 30 largest positive and up to 30 most negative loadings.
## head() is used instead of x[1:min(30, length(x))]: when no coefficient
## has the given sign, 1:0 expands to c(1, 0) and would inject a phantom NA
## entry into the result.
n_top <- 30
more <- head(sort(beta[which(beta > 0)], decreasing = TRUE), n_top)
less <- head(sort(beta[which(beta < 0)]), n_top)

pdf(file = file.path(FIG_DIR, "Figure6.pdf"))

par(cex=.8, mar=c(1,1,1,1))

## empty canvas with fixed limits; nothing is drawn by this call
plot(0, 0, type="n",
     xlim=c(0,3), ylim=c(0,2000),
     xaxt='n', yaxt='n',
     xlab="", ylab="",
     bty='n')

## left column: positive loadings from top to bottom, font size scaled by
## the square root of each loading relative to the smallest one shown.
## text() rejects zero-length labels, hence the guard.
yloc <- seq(from=1800, to=1, length.out=length(more))
if (length(more) > 0) {
    text(rep(1, length(more)), yloc, names(more),
         cex=.5*sqrt(more/(more[length(more)])),
         col="darkgreen")
}

## right column: negative loadings, most negative at the bottom, all drawn
## at the same size
if (length(less) > 0) {
    text(rep(2, length(less)),
         seq(from=30, to=1830, length.out=length(less)),
         names(less),
         col="red")
}

## column headers, each underlined by a translucent grey bar
text(1,1970, "More Lobby", cex=2)
rect(0.6,1895,1.4,1910,col = rgb(0.5,0.5,0.5,1/4))

text(2,1970, "Less Lobby", cex=2)
rect(1.6,1895,2.4,1910,col = rgb(0.5,0.5,0.5,1/4))

dev.off()



